## *************************************
##         A3 Correlation Table       
##         Hao Zhang & Ye Zhang
##             2025/7/24
## *************************************

library(stargazer)
library(readstata13)
library(tidyverse)

# load data
setwd("../raw data")
city <- read.dta13("city panel 1995-2010.dta")
city <- city %>% filter(substr(as.character(city$citycode), 3, 4) != "01")
city1 <- city %>% select(citycode, year, gdp) %>% mutate(year = year + 1)
city <- left_join(city, city1, by = c("citycode", "year"))
city$gdpgrowth <- city$gdp.x/city$gdp.y - 1
city <- city %>% mutate(province = as.integer(citycode/100)) %>% 
  filter(province != 11, province != 12, province != 31, citycode != 5102) %>%
  group_by(province, year) %>%
  mutate(gdpgrowth_mean = mean(gdpgrowth, na.rm = TRUE)) %>% 
  mutate(year = year + 1) %>% filter(gdpgrowth >= -0.2, gdpgrowth <= 0.6) %>% #hist set gdpgrowth < 3
  mutate(gdp_pressure = gdpgrowth_mean - gdpgrowth) %>%
  rename(gdp = gdp.x)

# add tax collection, unemployment rate, and medical resources
city1 <- readxl::read_xlsx("city panel 1996-2014.xlsx") # warnings due to conversion from text to number for the year variable for districts and provinces
citycode <- readxl::read_xlsx("citycode.xlsx") %>%
  mutate(citycode = as.integer(citycode/100))
city1 <- left_join(city1, citycode, by = "city", relationship = "many-to-many") %>% distinct() %>%
  filter(year >= 2001 & year <= 2007)
city <- left_join(city, city1, by = c("citycode", "year"))

city <- city %>% mutate(province = as.integer(citycode/100)) %>% 
  filter(year >= 2001 & year <= 2007) %>%
  mutate(tax = ifelse(province == 33 & year >= 2004 | province == 13 & year >= 2002 | 
                        province == 42 & year >= 2002 | province == 44 & year >= 2002 | 
                        province == 32 & year >= 2005 | province == 53 & year >= 2006, 1, 0))

# create variables
city <- city %>% mutate(capacity = (tax_revenue*100000 - profits)/gdp/10, 
                        deficit = expenditure - revenue_allcity, 
                        unemp_rate = unemployment/population) 

# keep relevant variables
city <- city %>% select(gdp_pressure, gdpgrowth, gdp, deficit, capacity, tax)
city <- city[,-c(1,2)]
city <- city %>% drop_na()
stargazer(cor(city), title = "Correlation Matrix for City Covariates")
